EG started this on 20160403 how does the noise effect the loglikelihood Weibull model->theta is a scale, gamma is a shape parameter >0 Gompertz Model-> G is a scale, R is a shape(rate) parameter >0
difference function of loglikelihood function of gompertz and weibull p.d.fs test check how L(Weibull,X)-L(Gompertz,X) values changes for parameters Weibull model->theta is a scale, gamma is a shape parameter >0 Gompertz Model-> G is a scale, R is a shape parameter>0
For additive Gaussian noise e ~ N (0, sigma^2) with known variance sigma^2 sd of gaussian noise function max sd would be = 3*mean(inverse.gomp.CDF) min sd would be mean(inverse.gomp.CDF)
require(flexsurv)
## Loading required package: flexsurv
## Loading required package: survival
require(gplots)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
gamma=0.01
theta=0.25
#test G and R in nested for loops
R=0.001
G=0.2
#R should be in [0, 0.05], and G should be [0.05, 0.3].
N=2000 # population size
#where alpha and beta are 2 parameters, N is number of population
#generate gompertz random numbers by using inverse CDF
#generate random number with a given distribution of Gompertz
#prediction
rgompertz = function(N,G, R){
### I have a function to generate random-numbers from
### (using the 'inversion method')
return( (log(1-(G/R*log(1-runif(N)))))/G)
}
set.seed(123)
#generate gompertz random numbers (lifespan)
#prediction
gompertz.random<-rgompertz(N,0.05,0.001)
average.lifespan=mean(gompertz.random)
scale=1
#generate gaussion random numbers
set.seed(123)
gaussian<-rnorm(N, mean = 0, sd=sd(gompertz.random)*scale)
lifespan = gompertz.random+ gaussian
summary(lifespan)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -31.08 48.96 70.31 68.96 90.92 148.80
lifespan[lifespan < 0] <- 0
lifespan<-floor(lifespan+0.5)
lifespan <- lifespan[ lifespan != 0 ]
calculate.s = function( lifespan ){
myData = sort( lifespan[!is.na(lifespan)] );
tmpGC = table( myData )
for( i in 2:length(tmpGC)) {
tmpGC[i] = tmpGC[i-1] + tmpGC[i] }
tot = length(myData)
tmpGC = tmpGC / tot;
s = 1 - tmpGC
#list( s=s, t=unique(my.data));
ret = data.frame( cbind(s, unique(myData)));
names(ret) = c("s", "t");
ret;
}
GC = calculate.s(lifespan)
plot(GC$s ~ GC$t)
#HQin's calculate mortality rate function to calculate the rate of mortality over time
calculate.mortality.rate = function( lifespan ){
GC = calculate.s(lifespan)
GC$ds=NA; GC$dt=NA
#first point
GC$dt[1] = GC$t[2]
GC$ds[1] = 1 - GC$s[1]
GC$mortality.rate[1] = GC$ds[1] / GC$dt[1]
for( j in 2:length(GC[,1])) {
GC$ds[j] = GC$s[j-1] - GC$s[j]
GC$dt[j] = -GC$t[j-1] + GC$t[j]
GC$mortality.rate[j] = GC$ds[j] / ( GC$s[j] * GC$dt[j])
}
return(GC)
} #end of calculate.mortality.rate()
GC = calculate.mortality.rate(lifespan)
GC = calculate.s(round( lifespan, digits=1))
head(GC)
## s t
## 1 0.9989770 1
## 2 0.9964194 2
## 3 0.9943734 3
## 4 0.9928389 4
## 5 0.9897698 5
## 6 0.9867008 6
GC$ds=NA; GC$dt=NA
GC$dt[1] = GC$t[2] #20130321 correct a bug GC$s -> GC$t
GC$ds[1] = 1 - GC$s[1]
GC$mortality.rate[1] = GC$ds[1] / GC$dt[1]
for( j in 2:length(GC[,1])) {
GC$ds[j] = GC$s[j-1] - GC$s[j]
GC$dt[j] = -GC$t[j-1] + GC$t[j]
GC$mortality.rate[j] = GC$ds[j] / ( GC$s[j] * GC$dt[j])
}
plot( GC$s ~ GC$t)
plot( GC$mortality.rate ~ GC$t, typ='l', log='y' )
summary(lm(log10(GC$mortality.rate[1:(length(GC$mortality.rate)-1)]) ~ GC$t[1:(length(GC$t)-1)]))
##
## Call:
## lm(formula = log10(GC$mortality.rate[1:(length(GC$mortality.rate) -
## 1)]) ~ GC$t[1:(length(GC$t) - 1)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6475 -0.1032 0.0373 0.1005 1.2410
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.6573283 0.0340717 -77.99 <2e-16 ***
## GC$t[1:(length(GC$t) - 1)] 0.0136375 0.0004088 33.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2035 on 141 degrees of freedom
## Multiple R-squared: 0.8876, Adjusted R-squared: 0.8868
## F-statistic: 1113 on 1 and 141 DF, p-value: < 2.2e-16
#then plot log(mortality rate) ~ time.
#For Weibull model, log(moretality rate) ~ log(time) give the linear form.
#pdf(paste("plots/","Gompertz.semi.log.plot.batch.pdf", sep=''), width=5, height=5)
plot( log10(GC$mortality.rate) ~ GC$t, type='l') #linear for Gompertz, semi-log plot
text(10,0,"R2= 0.89")
#dev.off()
summary(lm(log10(GC$mortality.rate[1:(length(GC$mortality.rate)-1)]) ~ log10(GC$t[1:(length(GC$t)-1)])))
##
## Call:
## lm(formula = log10(GC$mortality.rate[1:(length(GC$mortality.rate) -
## 1)]) ~ log10(GC$t[1:(length(GC$t) - 1)]))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54713 -0.15443 -0.05172 0.10854 1.70016
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.94292 0.10112 -38.99 <2e-16 ***
## log10(GC$t[1:(length(GC$t) - 1)]) 1.31082 0.05682 23.07 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2778 on 141 degrees of freedom
## Multiple R-squared: 0.7905, Adjusted R-squared: 0.789
## F-statistic: 532.1 on 1 and 141 DF, p-value: < 2.2e-16
#pdf(paste("plots/","Weibull.log.log.plot.batch.pdf", sep=''), width=5, height=5)
plot( log10(GC$mortality.rate) ~ log10(GC$t), type='l' ) #linear for Weibull, log-log plot
text(0.75,0,"R2=0.79")
#dev.off()
#generate gompertz random numbers (lifespan)
#prediction
gompertz.random<-rgompertz(N,G,R)
average.lifespan=mean(gompertz.random)
#create a data frame for fit parameters
fit_names = c( 'sd.rgompertz','current.seed','scale','sderr','Delta_LL','G','R','LWei','LGomp','MeanLF','Delta_LL.flex','LWei.flex','LGomp.flex','G.flex.estimated','R.flex.estimated','LLG.par','LLR.par','gamma.flex.estimated','theta.flex.estimated')
fit= data.frame(matrix(, nrow=length(fit_names), ncol=N)) #set up a skeleton table
names(fit) = c( "sd.rgompertz","current.seed","scale","sderr","Delta_LL","G","R","LWei","LGomp","MeanLF","Delta_LL.flex","LWei.flex","LGomp.flex","G.flex.estimated","R.flex.estimated","LLG.par","LLR.par","gamma.flex.estimated","theta.flex.estimated")
#log likelihood function for the Weibull model
#s = exp(-( theta*x)^gamma)
#f = gamma * theta^gamma *x^(gamma-1);
LL_wei<-function(param,y){
theta<- exp(param[1]) #take exponential to avoid NaNs when taking log(theta)
gamma<- exp(param[2]) # avoid NaNs when taking log(gamma)
data=lifespan[!is.na(lifespan)]
#log_s = -( theta*x)^gamma
#log_f = log(gamma)+ gamma*log(theta)+ (gamma-1)*log(x) ;
#w.lh = sum(log_s) + sum(log_f);
w.lh<- sum(log(gamma) + gamma*log(theta) + (gamma-1)*log(data)) - sum(
(theta*data)^gamma)
return(-w.lh)
}
# take log(param) since you take exponential above to avoid NaN values above
#log likelihood function of gompertz model
#s = exp( (R/G) *(1 - exp(G* data)) )
#f = R * exp( G * data );
LL_gomp<- function( param, y ) {
G = exp(param[1]); R = exp(param[2]);
data = lifespan[!is.na(lifespan)];
#log_s = (R/G) *(1 - exp(G* data))
#log_f = log(R) + G * data ;
g.lh = sum((R/G) *(1 - exp(G* data))) + sum(log(R) + G * data );
return(- g.lh)
}
for( seed in c(12345, 20160711, -1881, 9999.1234,300045,50758,-10000,74562,-92345,25434)) {
set.seed(seed)
current.seed<-seed
for (R_in in c(1E-3, 0.002,0.003, 0.005,0.008, 0.01,0.03, 0.05)){ #fix alpha or in other words R shape parameter
for (G_in in c(0.05,0.08, 0.1,0.12,0.15,0.17, 0.2, 0.25)){
#for (sd in seq(round.lifespan,3*round.lifespan,by=1)){
for (i in c(0,1,2,3,4,5)){
#generate gompertz random numbers (lifespan)
#prediction
gompertz.random<-rgompertz(N, G_in, R_in)
lifespan= gompertz.random + rnorm(N, mean=0, sd=sd(gompertz.random)*i)
lifespan[lifespan < 0] <- 0
lifespan<-floor(lifespan+0.5)
lifespan <- lifespan[ lifespan != 0 ]
summary(lifespan)
scale=i
sdrgompertz<-sd(gompertz.random)
average.lifespan=mean(lifespan)
#store average.lifespan into MeanLF list
MeanLF=average.lifespan
#Log likelihood function for the Weibull model
#calculate noise change
sderr = sd(gompertz.random)*i
G=G_in
R=R_in
weib=optim(log(c(3,0.03)),LL_wei,y=lifespan)
LWei = - weib$value
gomp<-optim(param<-log(c(G_in,R_in)), fn=LL_gomp, y=lifespan)
LGomp= -gomp$value
# store R and G estimation from optim of likl functions in Gompertz
LLG.par =gomp$par[1]
LLR.par=gomp$par[2]
delta.likelihood<- -weib$value-(-gomp$value)
#Delta_LL[[length(Delta_LL)+1]] = delta.likelihood.wei
#flexsurv to calculate the log-likelihood value for both models
#flexsurv only works with positive variables.
fitGomp = flexsurvreg(formula = Surv(lifespan) ~ 1, dist="gompertz")
fitWei = flexsurvreg(formula = Surv(lifespan) ~ 1, dist="weibull")
LWei.flex=fitWei$loglik
LGomp.flex=fitGomp$loglik
param.Gomp<-fitGomp$res; R.flex<-param.Gomp[2]; G.flex<-param.Gomp[1];
R.flex.estimated<-R.flex
G.flex.estimated<-G.flex
param.Wei<-fitWei$res; gamma.flex<-param.Wei[1]; theta.flex<-param.Wei[2];
gamma.flex.estimated<-gamma.flex;
theta.flex.estimated<-theta.flex
delta_flexsurv=fitWei$loglik-fitGomp$loglik
#fitWei$loglik
Delta_LL.flex=delta_flexsurv
#sim_names = c( "sd.rgompertz","scale","sderr","Delta_LL","G","R","LWei","LGomp","MeanLF","Delta_LL.flex","LWei.flex","LGomp.flex","G.flex.estimated","R.flex.estimated","LLG.par","LLR.par","gamma.flex.estimated","theta.flex.estimated")
fit = rbind(fit,c(sdrgompertz,current.seed,scale,sderr,delta.likelihood,G,R,LWei,LGomp,MeanLF,Delta_LL.flex,LWei.flex,LGomp.flex,G.flex.estimated,R.flex.estimated,LLG.par,LLR.par,gamma.flex.estimated,theta.flex.estimated))
#write.csv(fit, file="Results.csv", row.names=F)
fit= fit[!is.na(fit[,1]), ]
}
}
}
}
fit<- fit[!is.na(names(fit))]
write.csv(fit, file="Results.csv", row.names=F)
new_fit<-fit[c(1:length(fit[fit$current.seed==12345,5])),]
for (i in 1:length(fit[fit$current.seed==12345,5])){
new_fit$Delta_LL[i]<-(fit[fit$current.seed==12345,5][i]+fit[fit$current.seed==20160711,5][i]+
fit[fit$current.seed==-1881,5][i]+fit[fit$current.seed==9999.1234,5][i]+fit[fit$current.seed== 300045,5][i]
+fit[fit$current.seed==50758,5][i]+fit[fit$current.seed==-10000,5][i]+fit[fit$current.seed== 74562,5][i]+
fit[fit$current.seed== -92345,5][i]+fit[fit$current.seed==25434,5][i])/10
}
summary( lm( new_fit$Delta_LL~ new_fit$Delta_LL.flex))
##
## Call:
## lm(formula = new_fit$Delta_LL ~ new_fit$Delta_LL.flex)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.563 -8.911 -1.738 4.999 128.279
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.66243 1.00025 2.662 0.0081 **
## new_fit$Delta_LL.flex 0.92260 0.02292 40.260 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.33 on 382 degrees of freedom
## Multiple R-squared: 0.8093, Adjusted R-squared: 0.8088
## F-statistic: 1621 on 1 and 382 DF, p-value: < 2.2e-16
summary( lm( new_fit$LGomp ~ new_fit$LGomp.flex))
##
## Call:
## lm(formula = new_fit$LGomp ~ new_fit$LGomp.flex)
##
## Residuals:
## Min 1Q Median 3Q Max
## -284.503 1.883 2.474 3.128 4.805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.743158 7.236998 0.379 0.705
## new_fit$LGomp.flex 1.000807 0.001078 928.053 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.9 on 382 degrees of freedom
## Multiple R-squared: 0.9996, Adjusted R-squared: 0.9996
## F-statistic: 8.613e+05 on 1 and 382 DF, p-value: < 2.2e-16
summary( lm( new_fit$LWei ~ new_fit$LWei.flex))
##
## Call:
## lm(formula = new_fit$LWei ~ new_fit$LWei.flex)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.044e-03 -1.430e-05 2.632e-05 5.246e-05 9.621e-05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.579e-05 3.595e-05 -4.390e-01 0.661
## new_fit$LWei.flex 1.000e+00 5.344e-09 1.871e+08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0001047 on 382 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 3.501e+16 on 1 and 382 DF, p-value: < 2.2e-16
summary(lm(new_fit$G~new_fit$G.flex.estimated))
##
## Call:
## lm(formula = new_fit$G ~ new_fit$G.flex.estimated)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.086906 -0.040857 -0.009407 0.035264 0.127737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.110645 0.004138 26.736 <2e-16 ***
## new_fit$G.flex.estimated 0.504566 0.051985 9.706 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05536 on 382 degrees of freedom
## Multiple R-squared: 0.1978, Adjusted R-squared: 0.1957
## F-statistic: 94.21 on 1 and 382 DF, p-value: < 2.2e-16
summary(lm(new_fit$R~new_fit$R.flex.estimated))
##
## Call:
## lm(formula = new_fit$R ~ new_fit$R.flex.estimated)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.020091 -0.006279 -0.000446 0.005374 0.036638
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0062347 0.0008815 -7.073 7.29e-12 ***
## new_fit$R.flex.estimated 1.1234927 0.0415438 27.044 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.009556 on 382 degrees of freedom
## Multiple R-squared: 0.6569, Adjusted R-squared: 0.656
## F-statistic: 731.4 on 1 and 382 DF, p-value: < 2.2e-16
#pdf(paste("plots/","simulated.G.vs.estimated.G.flex.batch.pdf", sep=''), width=5, height=5)
plot(new_fit$G~new_fit$G.flex.estimated)
#dev.off()
#pdf(paste("plots/","simulated.R.vs.estimated.R.flex.batch.pdf", sep=''), width=5, height=5)
plot(new_fit$R~new_fit$R.flex.estimated)
#dev.off()
n.col=256 # number of colors
cm = redblue(n.col) # red-white-blue colormap
mmx = min(abs(min(new_fit$Delta_LL)),abs(max(new_fit$Delta_LL))) # find min value, or you can set to a number
colbr <- c(seq(-mmx/2,mmx/2, len=length(cm)+1)) # vector of symmetric breaks
R.els = unlist( unique(new_fit$R))
colnum = length(R.els)
tmp = unlist( unique(new_fit$scale))
scale.els = tmp[order(tmp)]
rownum = length(scale.els)
mat = matrix( data=NA, nrow= rownum, ncol=colnum) #noise as row, alpha as columns
rownames(mat) = scale.els
colnames(mat) = R.els
for (k in c(0.05,0.08, 0.1,0.12,0.15,0.17, 0.2, 0.25)){
data = new_fit[new_fit[,6]==k, 5]
data<-unlist(data)
heat_mat<-matrix(data,ncol=colnum,nrow=rownum)
rownames(heat_mat) <- scale.els
colnames(heat_mat) <- R.els
library(gplots)
hM <- format(round(heat_mat, 1))
#data_mat<-scale(heat_mat,scale=TRUE,center=FALSE)
#jpeg(paste("plots/",k, ".fixed.G.jpg", sep=''))
heatmap.2(heat_mat, scale="none",cellnote=hM,col = cm, breaks=colbr, notecol="black", margins=c(5,10),
dendrogram='none', Rowv=FALSE, Colv=FALSE,trace='none',key=TRUE,symkey=F, density.info="histogram",
xlab = "R parameters",
ylab = "scale", main = bquote(paste("R vs. scale for fixed" ~ G==.(k))),par(cex.main=.5),srtCol=315, srtRow=315, adjCol = c(0,1),cexRow=0.8,cexCol=0.8)
#dev.off()
}
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
G.els = unlist( unique(new_fit$G))
colnum = length(G.els)
R.els=unlist(unique(new_fit$R))
rownum = length(R.els)
mat = matrix( data=NA, nrow= rownum, ncol=colnum) #noise as row, alpha as columns
rownames(mat) = R.els
colnames(mat) = G.els
for (n in c(0,1,2,3,4,5) ){
data = new_fit[new_fit[,3]==n, 5]
data<-unlist(data)
heat_mat<-matrix(data,ncol=colnum,nrow=rownum)
heat_mat<-t(heat_mat)
rownames(heat_mat) <- R.els
colnames(heat_mat) <- G.els
library(gplots)
hM <- format(round(heat_mat, 1))
#data_mat<-scale(heat_mat,scale=TRUE,center=FALSE)
#jpeg(paste("plots/",n, ".fixed_scale.jpg", sep=''))
heatmap.2(heat_mat, cellnote=hM,col = cm, breaks=colbr,scale="none", density.info="histogram",notecol="black", margins=c(5,10),
dendrogram='none', Rowv=FALSE, Colv=FALSE,trace='none',key=TRUE,
xlab = "G parameters",
ylab = "R parameters", main = bquote(paste("G vs. R for fixed" ~ scale==.(n))),par(cex.main=.5),srtCol=315, srtRow=315, adjCol = c(0,1),cexRow=0.8,cexCol=0.8)
#dev.off()
}
## NULL
## NULL
## Warning in image.default(z = matrix(z, ncol = 1), col = col, breaks =
## tmpbreaks, : unsorted 'breaks' will be sorted before use
## NULL
## Warning in image.default(z = matrix(z, ncol = 1), col = col, breaks =
## tmpbreaks, : unsorted 'breaks' will be sorted before use
## NULL
## Warning in image.default(z = matrix(z, ncol = 1), col = col, breaks =
## tmpbreaks, : unsorted 'breaks' will be sorted before use
## NULL
## Warning in image.default(z = matrix(z, ncol = 1), col = col, breaks =
## tmpbreaks, : unsorted 'breaks' will be sorted before use
## NULL
G.els = unlist( unique(new_fit$G))
colnum = length(G.els)
tmp = unlist( unique(new_fit$scale))
scale.els = tmp[order(tmp)]
rownum = length(scale.els)
mat = matrix( data=NA, nrow= rownum, ncol=colnum) #noise as row, alpha as columns
colnames(mat) = G.els
rownames(mat) = scale.els
for (j in c(1E-3, 0.002,0.003, 0.005,0.008, 0.01,0.03, 0.05) ){
data = new_fit[new_fit[,7]==j, 5]
data<-unlist(data)
heat_mat<-matrix(data,ncol=colnum,nrow=rownum)
#rownames(heat_mat, do.NULL = TRUE, prefix = "row")
rownames(heat_mat) <- scale.els
colnames(heat_mat) <- G.els
library(gplots)
hM <- format(round(heat_mat, 1))
#data_mat<-scale(heat_mat,scale=TRUE,center=FALSE)
#paste(file = "~/github/model.comparison/plots/heatplot_zero_noise_G",k,".jpeg",sep="")
#jpeg(paste("plots/",j, ".fixed_R.jpg", sep=''))
heatmap.2(heat_mat, cellnote=hM,col = cm, breaks=colbr, scale="none", notecol="black", margins=c(5,10),
dendrogram='none', density.info="histogram",Rowv=FALSE, Colv=FALSE,trace='none', key = T,
xlab = "G parameters",
ylab = "scale", main = bquote(paste("G vs. scale" ~ R==.(j))),par(cex.main=.5),srtCol=315, srtRow=315,adjCol = c(0,1),cexRow=0.8,cexCol=0.8)
#dev.off()
#jpeg(paste("plots/",j, ".fixed_R.dLLhist.jpg", sep=''))
#hist(heat_mat, main = bquote(paste("G vs. scale" ~ R==.(j))))
#dev.off()
}
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## Warning in image.default(z = matrix(z, ncol = 1), col = col, breaks =
## tmpbreaks, : unsorted 'breaks' will be sorted before use
## NULL
## Warning in image.default(z = matrix(z, ncol = 1), col = col, breaks =
## tmpbreaks, : unsorted 'breaks' will be sorted before use